Wine Quality¶
Libraries¶
In [1]:
# installing the Libraries
!pip install pandas numpy matplotlib seaborn plotly scikit-learn xgboost lightgbm catboost tensorflow lime shap
#importing the Libs
import pandas as pd
import numpy as np
import time
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# interactive charts
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Data Preparation & Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler ,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
# Model Evaluation & Explainability
from sklearn.metrics import accuracy_score, f1_score, classification_report
import lime
import lime.lime_tabular
import shap
# Settings
# Plotly renderer: 'notebook' embeds interactive figures so they render on GitHub/nbviewer
import plotly.io as pio
pio.renderers.default = 'notebook'
pd.set_option('display.max_columns', None) # display all columns of a DataFrame without truncation
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore') # NOTE(review): blanket suppression hides ALL warnings; consider category=UserWarning to keep important ones visible
print("All libraries imported successfully!")
Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.2) Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (2.0.2) Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (3.10.0) Requirement already satisfied: seaborn in /usr/local/lib/python3.11/dist-packages (0.13.2) Requirement already satisfied: plotly in /usr/local/lib/python3.11/dist-packages (5.24.1) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (1.6.1) Requirement already satisfied: xgboost in /usr/local/lib/python3.11/dist-packages (3.0.4) Requirement already satisfied: lightgbm in /usr/local/lib/python3.11/dist-packages (4.6.0) Requirement already satisfied: catboost in /usr/local/lib/python3.11/dist-packages (1.2.8) Requirement already satisfied: tensorflow in /usr/local/lib/python3.11/dist-packages (2.19.0) Requirement already satisfied: lime in /usr/local/lib/python3.11/dist-packages (0.2.0.1) Requirement already satisfied: shap in /usr/local/lib/python3.11/dist-packages (0.48.0) Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2) Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2) Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.3.3) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (4.59.0) Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.4.9) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages 
(from matplotlib) (25.0) Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (11.3.0) Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (3.2.3) Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.11/dist-packages (from plotly) (9.1.2) Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (1.16.1) Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (1.5.1) Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (3.6.0) Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.11/dist-packages (from xgboost) (2.23.4) Requirement already satisfied: graphviz in /usr/local/lib/python3.11/dist-packages (from catboost) (0.21) Requirement already satisfied: six in /usr/local/lib/python3.11/dist-packages (from catboost) (1.17.0) Requirement already satisfied: absl-py>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.4.0) Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.6.3) Requirement already satisfied: flatbuffers>=24.3.25 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (25.2.10) Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.6.0) Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.2.0) Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (18.1.1) Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.4.0) Requirement already satisfied: 
protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (5.29.5) Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.32.3) Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from tensorflow) (75.2.0) Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.1.0) Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (4.14.1) Requirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.17.3) Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.74.0) Requirement already satisfied: tensorboard~=2.19.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.19.0) Requirement already satisfied: keras>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.10.0) Requirement already satisfied: h5py>=3.11.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.14.0) Requirement already satisfied: ml-dtypes<1.0.0,>=0.5.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.5.3) Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.37.1) Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from lime) (4.67.1) Requirement already satisfied: scikit-image>=0.12 in /usr/local/lib/python3.11/dist-packages (from lime) (0.25.2) Requirement already satisfied: slicer==0.0.8 in /usr/local/lib/python3.11/dist-packages (from shap) (0.0.8) Requirement already satisfied: numba>=0.54 in /usr/local/lib/python3.11/dist-packages (from shap) (0.60.0) Requirement already satisfied: cloudpickle in /usr/local/lib/python3.11/dist-packages (from 
shap) (3.1.1) Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from astunparse>=1.6.0->tensorflow) (0.45.1) Requirement already satisfied: rich in /usr/local/lib/python3.11/dist-packages (from keras>=3.5.0->tensorflow) (13.9.4) Requirement already satisfied: namex in /usr/local/lib/python3.11/dist-packages (from keras>=3.5.0->tensorflow) (0.1.0) Requirement already satisfied: optree in /usr/local/lib/python3.11/dist-packages (from keras>=3.5.0->tensorflow) (0.17.0) Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.11/dist-packages (from numba>=0.54->shap) (0.43.0) Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorflow) (3.4.3) Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorflow) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorflow) (2.5.0) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorflow) (2025.8.3) Requirement already satisfied: networkx>=3.0 in /usr/local/lib/python3.11/dist-packages (from scikit-image>=0.12->lime) (3.5) Requirement already satisfied: imageio!=2.35.0,>=2.33 in /usr/local/lib/python3.11/dist-packages (from scikit-image>=0.12->lime) (2.37.0) Requirement already satisfied: tifffile>=2022.8.12 in /usr/local/lib/python3.11/dist-packages (from scikit-image>=0.12->lime) (2025.6.11) Requirement already satisfied: lazy-loader>=0.4 in /usr/local/lib/python3.11/dist-packages (from scikit-image>=0.12->lime) (0.4) Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.11/dist-packages (from tensorboard~=2.19.0->tensorflow) (3.8.2) Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from 
tensorboard~=2.19.0->tensorflow) (0.7.2) Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from tensorboard~=2.19.0->tensorflow) (3.1.3) Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.11/dist-packages (from werkzeug>=1.0.1->tensorboard~=2.19.0->tensorflow) (3.0.2) Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich->keras>=3.5.0->tensorflow) (4.0.0) Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich->keras>=3.5.0->tensorflow) (2.19.2) Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich->keras>=3.5.0->tensorflow) (0.1.2) All libraries imported successfully!
Loading the dataset¶
In [2]:
# Load the white-wine dataset. The separator for this file is a semicolon ';', not a comma.
# NOTE(review): absolute Colab path — consider a configurable, relative DATA_DIR.
DATA_PATH = '/content/drive/MyDrive/wine/winequality-white.csv'

try:
    df = pd.read_csv(DATA_PATH, sep=';')
    print("Dataset loaded successfully.")
    print("Shape of the dataset:", df.shape)
except FileNotFoundError:
    print("Error: 'winequality-white.csv' not found. Please ensure the file is in the correct directory.")
    # Empty fallback so the cells below fail softly (empty displays) rather than on NameError
    df = pd.DataFrame()

# Display the first 5 rows
print("\nFirst 5 rows of the dataset:")
display(df.head())

# Display basic information and check for null values
print("\nDataset Information:")
df.info()

# Display descriptive statistics
print("\nDescriptive Statistics:")
display(df.describe())
Dataset loaded successfully. Shape of the dataset: (4898, 12) First 5 rows of the dataset:
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
Dataset Information: <class 'pandas.core.frame.DataFrame'> RangeIndex: 4898 entries, 0 to 4897 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 4898 non-null float64 1 volatile acidity 4898 non-null float64 2 citric acid 4898 non-null float64 3 residual sugar 4898 non-null float64 4 chlorides 4898 non-null float64 5 free sulfur dioxide 4898 non-null float64 6 total sulfur dioxide 4898 non-null float64 7 density 4898 non-null float64 8 pH 4898 non-null float64 9 sulphates 4898 non-null float64 10 alcohol 4898 non-null float64 11 quality 4898 non-null int64 dtypes: float64(11), int64(1) memory usage: 459.3 KB Descriptive Statistics:
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 |
| mean | 6.854788 | 0.278241 | 0.334192 | 6.391415 | 0.045772 | 35.308085 | 138.360657 | 0.994027 | 3.188267 | 0.489847 | 10.514267 | 5.877909 |
| std | 0.843868 | 0.100795 | 0.121020 | 5.072058 | 0.021848 | 17.007137 | 42.498065 | 0.002991 | 0.151001 | 0.114126 | 1.230621 | 0.885639 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 2.000000 | 9.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
| 25% | 6.300000 | 0.210000 | 0.270000 | 1.700000 | 0.036000 | 23.000000 | 108.000000 | 0.991723 | 3.090000 | 0.410000 | 9.500000 | 5.000000 |
| 50% | 6.800000 | 0.260000 | 0.320000 | 5.200000 | 0.043000 | 34.000000 | 134.000000 | 0.993740 | 3.180000 | 0.470000 | 10.400000 | 6.000000 |
| 75% | 7.300000 | 0.320000 | 0.390000 | 9.900000 | 0.050000 | 46.000000 | 167.000000 | 0.996100 | 3.280000 | 0.550000 | 11.400000 | 6.000000 |
| max | 14.200000 | 1.100000 | 1.660000 | 65.800000 | 0.346000 | 289.000000 | 440.000000 | 1.038980 | 3.820000 | 1.080000 | 14.200000 | 9.000000 |
Exploratory Data Analysis¶
In [3]:
# Distribution of the target variable: how many wines received each quality score
fig = px.histogram(
    df,
    x='quality',
    title='Distribution of Wine Quality Scores',
    color_discrete_sequence=['#4B0082'],  # indigo; add width=/height= here to resize
)
fig.update_layout(
    bargap=0.2,
    xaxis_title="Wine Quality Score",
    yaxis_title="Count",
)
fig.show()
# It's clear the dataset is imbalanced, with most wines rated 5, 6, or 7.
# This is important for modeling, as accuracy alone can be a misleading metric.
In [4]:
# Pairwise Pearson correlations between all numeric columns
corr_matrix = df.corr()

# Annotated heatmap: each cell prints its correlation rounded to 2 decimals
heatmap_trace = go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='jet',
    text=corr_matrix.values,
    texttemplate="%{text:.2f}",
    textfont={"size": 10},
    colorbar=dict(title='Correlation'),
)

fig = go.Figure(data=heatmap_trace)
fig.update_layout(
    title='Correlation Heatmap of Wine Features',
    xaxis_tickangle=-45,
    xaxis_nticks=36,
    yaxis_nticks=36,
    height=750,
    width=850,
)
fig.show()
# From the heatmap, 'alcohol' has the strongest positive correlation with quality.
# 'density' has the strongest negative correlation.
In [5]:
# Rank features by absolute correlation with quality; index[0] is 'quality' itself,
# so the top 3 predictive features are index positions 1..3.
quality_corr = df.corr()['quality'].abs().sort_values(ascending=False)
top_features = quality_corr.index[1:4]
print(f"Top 3 correlated features with quality: {list(top_features)}")

# Scatter matrix of the top features, with points colored by quality.
# (Removed an unused `top_features_for_pairplot` variable that was never passed anywhere.)
fig = px.scatter_matrix(
    df,
    dimensions=top_features,
    color="quality",
    title="Interactive Pairplot of Top Correlated Features",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.update_layout(height=700)
fig.show()
# Observations:
# - Alcohol: Higher quality wines tend to have higher alcohol content.
# - Density: Higher quality wines tend to have lower density.
# - Chlorides: Higher quality wines tend to have lower chloride levels.
Top 3 correlated features with quality: ['alcohol', 'density', 'chlorides']
In [6]:
# Top features vs quality: box and violin panels share the same 1x3 subplot
# scaffolding, so build both figures through one helper instead of two
# copy-pasted loops.
def plot_feature_panels(trace_factory, title):
    """Render a 1x3 subplot figure with one panel per top correlated feature.

    trace_factory: callable(feature_name) -> a plotly trace for that feature.
    title: figure title.
    """
    panels = make_subplots(rows=1, cols=3, subplot_titles=top_features)
    for i, feature in enumerate(top_features):
        panels.add_trace(trace_factory(feature), row=1, col=i + 1)
    panels.update_layout(title_text=title, showlegend=False)
    panels.show()

# Box plots: median and IQR of each feature per quality score
plot_feature_panels(
    lambda f: go.Box(x=df['quality'], y=df[f], name=f),
    "Top 3 Features vs. Wine Quality",
)

# Violin plots: full distribution shape, with box and mean line overlaid
plot_feature_panels(
    lambda f: go.Violin(x=df['quality'], y=df[f], name=f, box_visible=True, meanline_visible=True),
    "Violin Plots: Top 3 Features vs. Wine Quality",
)
In [7]:
# Project the data onto its first two principal components for a 2-D view.
X = df.drop('quality', axis=1)
y = df['quality']

# Standardize first: PCA is scale-sensitive, so center each feature at mean 0 / std 1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Two uncorrelated components capture the dominant variance directions
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Assemble a plotting frame with the projected coordinates and the target
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df_pca['quality'] = y

fig = px.scatter(
    df_pca, x='PC1', y='PC2', color='quality',
    title='PCA of Wine Data (2 Components)',
    color_continuous_scale=px.colors.sequential.Viridis,
    hover_data=['quality'],
)
fig.show()
# The plot shows significant overlap between quality classes,
# indicating that a simple linear separation is not possible.
In [8]:
# Cluster the scaled features with K-Means, one cluster per distinct quality score.
n_clusters = df['quality'].nunique()

# K-Means assigns each point to the cluster with the nearest centroid (cluster mean);
# n_init=10 reruns the algorithm with 10 different centroid seeds and keeps the best fit.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)

# Cast labels to str so Plotly treats them as discrete categories.
# With an int column the color scale is continuous and category_orders has no effect.
df_pca['kmeans_cluster'] = clusters.astype(str)

fig = px.scatter(
    df_pca, x='PC1', y='PC2', color='kmeans_cluster',
    title=f'K-Means Clustering Visualization ({n_clusters} Clusters)',
    category_orders={"kmeans_cluster": [str(c) for c in range(n_clusters)]},
    hover_data=['quality'],
)
fig.show()
# We can see that K-Means finds some structure, but it doesn't perfectly map to the quality scores,
# further confirming the complexity of the problem.
Data Preparation¶
In [9]:
# Define features (X) and target (y)
X = df.drop('quality', axis=1)
y_original = df['quality']
# Encode the quality labels (3..9) to zero-indexed integers (0..6); some of the
# classifiers used later (e.g. XGBoost) require contiguous labels starting at 0.
le = LabelEncoder()
y = le.fit_transform(y_original)
# Keep the original class names so encoded predictions can be mapped back later
original_class_names = le.classes_
print(f"Original quality labels: {original_class_names}")
print(f"Encoded labels: {np.unique(y)}")
# name -> (X_train, X_test, y_train, y_test) preprocessing variants to compare
datasets = {}
# 2.1. Original (raw-feature) dataset
# stratify=y - training and testing sets maintain the same proportion of classes as the original dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
datasets['Original'] = (X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy())
print("1. Original dataset created.")
# 2.2. StandardScaler dataset: each feature centered at mean 0 with unit std.
# The scaler is fit on the train split only, then applied to the test split,
# so no test-set information leaks into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
datasets['Scaled'] = (X_train_scaled, X_test_scaled, y_train.copy(), y_test.copy())
print("2. StandardScaler dataset created.")
# 2.3. PCA dataset: with a float n_components, sklearn keeps as many components
# as needed to retain that fraction of variance.
pca = PCA(n_components=0.95) # Retain 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f"3. PCA dataset created with {pca.n_components_} components.")
datasets['PCA'] = (X_train_pca, X_test_pca, y_train.copy(), y_test.copy())
# 2.4. SelectKBest dataset: keep the 6 features with the highest ANOVA F-score vs. the target
k_best = SelectKBest(f_classif, k=6)
X_train_kbest = k_best.fit_transform(X_train_scaled, y_train)
X_test_kbest = k_best.transform(X_test_scaled)
selected_features = X.columns[k_best.get_support()]
print(f"4. SelectKBest dataset created with features: {list(selected_features)}")
datasets['KBest'] = (X_train_kbest, X_test_kbest, y_train.copy(), y_test.copy())
# 2.5. Outlier Removal dataset: IsolationForest labels inliers 1 and outliers -1;
# only the training split is filtered.
iso_forest = IsolationForest(contamination='auto', random_state=42)
outliers = iso_forest.fit_predict(X_train_scaled)
X_train_no_outliers = X_train_scaled[outliers == 1]
y_train_no_outliers = y_train[outliers == 1]
print(f"5. Outlier Removal dataset created. Removed {np.sum(outliers == -1)} outliers.")
# We use the original scaled test set so every variant is scored on the same data
datasets['NoOutliers'] = (X_train_no_outliers, X_test_scaled, y_train_no_outliers, y_test.copy())
print("\nAll datasets are prepared and ready for modeling.")
Original quality labels: [3 4 5 6 7 8 9] Encoded labels: [0 1 2 3 4 5 6] 1. Original dataset created. 2. StandardScaler dataset created. 3. PCA dataset created with 9 components. 4. SelectKBest dataset created with features: ['volatile acidity', 'residual sugar', 'chlorides', 'total sulfur dioxide', 'density', 'alcohol'] 5. Outlier Removal dataset created. Removed 236 outliers. All datasets are prepared and ready for modeling.
MODELING¶
In [10]:
# Defining the simple neural network used as one of the candidate models.
def create_nn(input_shape, num_classes):
    """Build and compile a small dense classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single sample, e.g. (n_features,).
    num_classes : int
        Number of output classes (size of the softmax layer).

    Returns
    -------
    A compiled tf.keras Sequential model.
    """
    model = Sequential([
        # Explicit Input layer: passing input_shape to Dense is deprecated in Keras 3
        tf.keras.Input(shape=input_shape),
        Dense(128, activation='relu'),   # first hidden layer: learn non-linear combinations
        Dropout(0.3),                    # drop 30% of units during training to curb overfitting
        Dense(64, activation='relu'),    # second hidden layer
        Dropout(0.3),
        Dense(num_classes, activation='softmax'),  # raw scores -> class probabilities
    ])
    # adam: adaptive-moment optimizer that adjusts weights during training;
    # categorical_crossentropy expects one-hot encoded targets.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
# Candidate models:
# - Logistic Regression: linear model + sigmoid/softmax for classification.
# - K-Nearest Neighbors: predicts from the closest training samples.
# - SVC: finds the best boundary to separate the different classes.
# - Random Forest: an ensemble (bagging) of decision trees.
# - Gradient Boosting: each new tree tries to fix the mistakes of the previous trees.
# - XGBoost: an optimized, faster, more memory-efficient version of gradient boosting.
# - LightGBM: gradient boosting tuned for speed on large datasets.
# - CatBoost: gradient boosting with native categorical-feature handling (no one-hot needed).
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),  # max_iter=1000 so the solver converges
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=42, probability=True),  # probability=True is needed later for LIME
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    # use_label_encoder was removed in XGBoost >= 2.0 (3.0.4 is installed here);
    # passing it only triggers a deprecation warning, so it is dropped.
    "XGBoost": XGBClassifier(random_state=42, eval_metric='mlogloss'),
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0),
    "Neural Network": "NN_PLACEHOLDER",  # special-cased in the loop below
}

# One result record per (dataset variant, model) combination
results = []

if 'datasets' in locals():
    # Loop through each preprocessing variant
    for d_name, (X_tr, X_te, y_tr, y_te) in datasets.items():
        print(f"--- Processing Dataset: {d_name} ---")
        # Loop through each model
        for m_name, model in models.items():
            start_time = time.time()
            if m_name == "Neural Network":
                # Keras needs one-hot targets. Always one-hot with the full class
                # count so train/test encodings line up even if a split were to
                # miss a rare class.
                input_shape = (X_tr.shape[1],)
                y_tr_nn_current = to_categorical(y_tr, num_classes=len(original_class_names))
                y_te_nn_current = to_categorical(y_te, num_classes=len(original_class_names))
                nn_model = create_nn(input_shape, len(original_class_names))
                nn_model.fit(X_tr, y_tr_nn_current, epochs=20, batch_size=32, verbose=0)
                y_pred_probs = nn_model.predict(X_te)
                y_pred = np.argmax(y_pred_probs, axis=1)  # most probable class per sample
            else:
                # sklearn-style estimators: fit() resets state, so reuse across datasets is safe
                model.fit(X_tr, y_tr)
                y_pred = model.predict(X_te)
            end_time = time.time()

            train_time = end_time - start_time
            accuracy = accuracy_score(y_te, y_pred)
            # Weighted F1 accounts for the class imbalance noted in the EDA
            f1 = f1_score(y_te, y_pred, average='weighted')

            results.append({
                'Dataset': d_name,
                'Model': m_name,
                'Accuracy': accuracy,
                'F1 Score': f1,
                'Training Time (s)': train_time
            })
            print(f"  {m_name}: F1 Score = {f1:.4f}, Time = {train_time:.2f}s")

# Summarize all runs and show the top 10 by F1
results_df = pd.DataFrame(results)
print("\n--- Model Comparison Complete ---")
display(results_df.sort_values(by='F1 Score', ascending=False).head(10))
--- Processing Dataset: Original --- Logistic Regression: F1 Score = 0.4999, Time = 2.08s K-Nearest Neighbors: F1 Score = 0.4332, Time = 0.03s Support Vector Machine: F1 Score = 0.2840, Time = 5.12s Random Forest: F1 Score = 0.6685, Time = 1.11s Gradient Boosting: F1 Score = 0.5620, Time = 12.72s XGBoost: F1 Score = 0.6459, Time = 15.70s [LightGBM] [Warning] Found whitespace in feature_names, replace with underlines [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000802 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 1323 [LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 11 [LightGBM] [Info] Start training from score -5.500748 [LightGBM] [Info] Start training from score -3.405802 [LightGBM] [Info] Start training from score -1.212002 [LightGBM] [Info] Start training from score -0.801405 [LightGBM] [Info] Start training from score -1.716558 [LightGBM] [Info] Start training from score -3.331694 [LightGBM] [Info] Start training from score -6.887042 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, 
best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf LightGBM: F1 Score = 0.6297, Time = 1.61s CatBoost: F1 Score = 0.6240, Time = 34.30s 31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step Neural Network: F1 Score = 0.2782, Time = 12.74s --- Processing Dataset: Scaled --- Logistic Regression: F1 Score = 0.5155, Time = 0.24s K-Nearest Neighbors: F1 Score = 0.5191, Time = 0.12s Support Vector Machine: F1 Score = 0.5328, Time = 4.23s Random Forest: F1 Score = 0.6644, Time = 1.07s Gradient Boosting: F1 Score = 0.5611, Time = 6.74s XGBoost: F1 Score = 0.6459, Time = 1.98s [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000460 seconds. You can set `force_col_wise=true` to remove the overhead. 
[LightGBM] [Info] Total Bins 1342 [LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 11 [LightGBM] [Info] Start training from score -5.500748 [LightGBM] [Info] Start training from score -3.405802 [LightGBM] [Info] Start training from score -1.212002 [LightGBM] [Info] Start training from score -0.801405 [LightGBM] [Info] Start training from score -1.716558 [LightGBM] [Info] Start training from score -3.331694 [LightGBM] [Info] Start training from score -6.887042 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: 
-inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with 
positive gain, best gain: -inf LightGBM: F1 Score = 0.6307, Time = 0.88s CatBoost: F1 Score = 0.6240, Time = 13.38s 31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step Neural Network: F1 Score = 0.5281, Time = 7.25s --- Processing Dataset: PCA --- Logistic Regression: F1 Score = 0.4973, Time = 0.12s K-Nearest Neighbors: F1 Score = 0.5345, Time = 0.07s Support Vector Machine: F1 Score = 0.5204, Time = 5.63s Random Forest: F1 Score = 0.6459, Time = 1.64s Gradient Boosting: F1 Score = 0.5448, Time = 13.28s XGBoost: F1 Score = 0.6202, Time = 1.24s [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000370 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 2295 [LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 9 [LightGBM] [Info] Start training from score -5.500748 [LightGBM] [Info] Start training from score -3.405802 [LightGBM] [Info] Start training from score -1.212002 [LightGBM] [Info] Start training from score -0.801405 [LightGBM] [Info] Start training from score -1.716558 [LightGBM] [Info] Start training from score -3.331694 [LightGBM] [Info] Start training from score -6.887042 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with 
positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf LightGBM: F1 Score = 0.6239, Time = 1.10s CatBoost: F1 Score = 0.6414, Time = 17.98s 31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step Neural Network: F1 Score = 0.5454, Time = 9.18s --- Processing Dataset: KBest --- Logistic Regression: F1 Score = 0.4982, Time = 0.09s K-Nearest Neighbors: F1 Score = 0.5184, Time = 0.04s Support Vector Machine: F1 Score = 0.4976, Time = 4.41s Random Forest: F1 Score = 0.6355, Time = 4.31s Gradient Boosting: F1 Score = 0.5508, Time = 4.72s XGBoost: F1 Score = 0.6149, Time = 0.58s [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000244 seconds. You can set `force_col_wise=true` to remove the overhead. 
[LightGBM] [Info] Total Bins 957 [LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 6 [LightGBM] [Info] Start training from score -5.500748 [LightGBM] [Info] Start training from score -3.405802 [LightGBM] [Info] Start training from score -1.212002 [LightGBM] [Info] Start training from score -0.801405 [LightGBM] [Info] Start training from score -1.716558 [LightGBM] [Info] Start training from score -3.331694 [LightGBM] [Info] Start training from score -6.887042 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: 
-inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf LightGBM: F1 Score = 0.6060, Time = 0.77s CatBoost: F1 Score = 0.5983, Time = 11.58s 31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step Neural Network: F1 Score = 0.5055, Time = 9.11s --- Processing Dataset: NoOutliers --- Logistic Regression: F1 Score = 0.5087, Time = 0.24s 
K-Nearest Neighbors: F1 Score = 0.5107, Time = 0.15s Support Vector Machine: F1 Score = 0.5263, Time = 4.77s Random Forest: F1 Score = 0.6578, Time = 1.00s Gradient Boosting: F1 Score = 0.5620, Time = 5.85s XGBoost: F1 Score = 0.6298, Time = 0.75s [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000142 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 1231 [LightGBM] [Info] Number of data points in the train set: 3682, number of used features: 11 [LightGBM] [Info] Start training from score -6.013987 [LightGBM] [Info] Start training from score -3.576482 [LightGBM] [Info] Start training from score -1.237668 [LightGBM] [Info] Start training from score -0.784662 [LightGBM] [Info] Start training from score -1.678877 [LightGBM] [Info] Start training from score -3.336014 [LightGBM] [Info] Start training from score -6.824917 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with 
positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf LightGBM: F1 Score = 0.6276, Time = 1.25s CatBoost: F1 Score = 0.6233, Time = 12.36s 31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step Neural Network: F1 Score = 0.5360, Time = 14.51s --- Model Comparison Complete ---
| | Dataset | Model | Accuracy | F1 Score | Training Time (s) |
|---|---|---|---|---|---|
| 3 | Original | Random Forest | 0.679592 | 0.668460 | 1.108600 |
| 12 | Scaled | Random Forest | 0.675510 | 0.664390 | 1.067936 |
| 39 | NoOutliers | Random Forest | 0.670408 | 0.657760 | 0.998529 |
| 21 | PCA | Random Forest | 0.656122 | 0.645927 | 1.637491 |
| 5 | Original | XGBoost | 0.654082 | 0.645863 | 15.695061 |
| 14 | Scaled | XGBoost | 0.654082 | 0.645863 | 1.983500 |
| 25 | PCA | CatBoost | 0.646939 | 0.641398 | 17.976503 |
| 30 | KBest | Random Forest | 0.643878 | 0.635521 | 4.312970 |
| 15 | Scaled | LightGBM | 0.638776 | 0.630696 | 0.879450 |
| 41 | NoOutliers | XGBoost | 0.637755 | 0.629836 | 0.751796 |
Compare models¶
In [11]:
if 'results_df' in locals():
    # Report the single best dataset/model pairing by weighted F1 score.
    best_result = results_df.loc[results_df['F1 Score'].idxmax()]
    print(f"Best Combination:\n{best_result}")
    # Render one grouped bar chart per metric: F1 score, then training time.
    chart_specs = [
        ('F1 Score', 'F1 Score (Weighted)',
         'Model F1 Scores by Dataset Preparation Technique'),
        ('Training Time (s)', 'Time (seconds)',
         'Model Training Time by Dataset Preparation Technique'),
    ]
    for metric_col, axis_label, chart_title in chart_specs:
        fig = px.bar(results_df, x='Model', y=metric_col, color='Dataset',
                     barmode='group',
                     title=chart_title,
                     labels={metric_col: axis_label, 'Model': 'Machine Learning Model'})
        # Angle the model names so they stay readable.
        fig.update_layout(xaxis_tickangle=-45)
        fig.show()
Best Combination: Dataset Original Model Random Forest Accuracy 0.679592 F1 Score 0.66846 Training Time (s) 1.1086 Name: 3, dtype: object
SHAP and LIME¶
In [12]:
if 'results_df' in locals() and 'datasets' in locals():
    # --- Setup for Explainability ---
    # Explain the best-performing (dataset, model) pair found previously.
    best_result = results_df.loc[results_df['F1 Score'].idxmax()]
    best_model_name = best_result['Model']
    best_dataset_name = best_result['Dataset']
    X_train_best, X_test_best, y_train_best, y_test_best = datasets[best_dataset_name]
    # Determine the correct feature names based on the dataset used.
    if best_dataset_name in ['Original', 'Scaled', 'NoOutliers']:
        correct_feature_names = df.drop('quality', axis=1).columns.tolist()
    elif best_dataset_name == 'KBest':
        correct_feature_names = selected_features.tolist()
    elif best_dataset_name == 'PCA':
        correct_feature_names = [f'PC{i+1}' for i in range(X_test_best.shape[1])]
    else:  # Fallback for any dataset variant added later
        correct_feature_names = [f'feature_{i}' for i in range(X_test_best.shape[1])]
    print(f"Using feature names for SHAP/LIME: {correct_feature_names}")
    # FIX: the splits are DataFrames for 'Original' but plain ndarrays for the
    # sklearn-transformed datasets (PCA / KBest / Scaled produce ndarrays).
    # The old code called .to_numpy() / .iloc unconditionally, which crashes
    # whenever the best dataset is not DataFrame-based. Normalise both splits
    # to ndarrays once, up front.
    X_train_arr = X_train_best.to_numpy() if hasattr(X_train_best, 'to_numpy') else np.asarray(X_train_best)
    X_test_arr = X_test_best.to_numpy() if hasattr(X_test_best, 'to_numpy') else np.asarray(X_test_best)
    # Re-train the best model
    if best_model_name == "Neural Network":
        print("Neural Network was the best model. SHAP TreeExplainer is not compatible. Skipping explainability.")
    else:
        # NOTE: this re-uses (and re-fits) the shared estimator object from
        # `models`; fit() discards any previous training state.
        best_model = models[best_model_name]
        best_model.fit(X_train_best, y_train_best)
        print(f"Explaining the '{best_model_name}' model trained on the '{best_dataset_name}' dataset.")
        # --- 1. SHAP (SHapley Additive exPlanations) ---
        print("\n--- Running SHAP Analysis ---")
        explainer_shap = shap.TreeExplainer(best_model)
        shap_values = explainer_shap.shap_values(X_test_arr)
        # Global feature importance: mean |SHAP| per feature, stacked by class.
        print("Displaying SHAP Summary Bar Plot (Global Feature Importance)...")
        shap.summary_plot(shap_values, X_test_arr, plot_type="bar", class_names=original_class_names, feature_names=correct_feature_names)
        # Beeswarm plot: per-sample SHAP values across all classes.
        print("\nDisplaying SHAP Beeswarm Plot for all classes...")
        plt.title("SHAP Beeswarm Plot (All Classes)")
        shap.summary_plot(shap_values, X_test_arr, feature_names=correct_feature_names, class_names=original_class_names)
        # --- Interactive Force Plot ---
        print("\nInitializing SHAP JavaScript visualization...")
        shap.initjs()
        # Choose a class to explain, for example, the class for quality '6'
        quality_to_explain = 6
        # Map the raw quality score to its LabelEncoder class index
        # (assumes `le` is the LabelEncoder fitted on quality — TODO confirm).
        class_index_to_explain = np.where(le.classes_ == quality_to_explain)[0][0]
        print(f"Displaying interactive SHAP Force Plot for Quality = {quality_to_explain} (class index {class_index_to_explain})...")
        # SHAP returns either a list of per-class arrays (e.g. RandomForest)
        # or one 3D array of shape (n_samples, n_features, n_classes)
        # (e.g. LightGBM); handle both. The expected_value lookup is the same
        # in both cases, so it is hoisted out of the branch.
        if isinstance(shap_values, list):
            shap_values_for_class = shap_values[class_index_to_explain]
        else:
            shap_values_for_class = shap_values[:, :, class_index_to_explain]
        expected_value_for_class = explainer_shap.expected_value[class_index_to_explain]
        # Interactive force plot over all test instances for the chosen class.
        # Building the frame from the ndarray (not the raw split) guarantees
        # the given column labels are applied rather than used for selection.
        display(shap.force_plot(expected_value_for_class,
                                shap_values_for_class,
                                pd.DataFrame(X_test_arr, columns=correct_feature_names)))
        # --- 2. LIME (Local Interpretable Model-agnostic Explanations) ---
        print("\n--- Running LIME Analysis ---")
        # LIME needs a NumPy array for training_data; X_train_arr already is one.
        explainer_lime = lime.lime_tabular.LimeTabularExplainer(
            training_data=X_train_arr,
            feature_names=correct_feature_names,
            class_names=[str(c) for c in original_class_names],
            mode='classification'
        )
        instance_idx_lime = 5
        print(f"\nExplaining prediction for instance #{instance_idx_lime} with LIME...")
        # Plain positional indexing is safe now that the split is an ndarray.
        instance_to_explain = X_test_arr[instance_idx_lime]
        explanation = explainer_lime.explain_instance(
            instance_to_explain,
            best_model.predict_proba,
            num_features=len(correct_feature_names),
            top_labels=1
        )
        explanation.show_in_notebook(show_table=True, show_all=False)
        print("DONE")
Using feature names for SHAP/LIME: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'] Explaining the 'Random Forest' model trained on the 'Original' dataset. --- Running SHAP Analysis --- Displaying SHAP Summary Bar Plot (Global Feature Importance)...
Displaying SHAP Beeswarm Plot for all classes...
Initializing SHAP JavaScript visualization...
Displaying interactive SHAP Force Plot for Quality = 6 (class index 3)...
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
--- Running LIME Analysis --- Explaining prediction for instance #5 with LIME...
DONE
In [13]:
# Re-execute the whole notebook in place so every output is fresh.
# NOTE(review): the warning in this cell's output shows the pattern below
# matched no files at runtime — confirm the notebook's actual path/filename
# (it only exists at /content/... when the file has been saved to the Colab VM).
!jupyter nbconvert --to notebook --execute --inplace /content/Wine_Quality.ipynb
[NbConvertApp] WARNING | pattern '/content/Wine_Quality.ipynb' matched no files
This application is used to convert notebook files (*.ipynb)
to various other formats.
WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.
Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
<cmd> --help-all
--debug
set log level to logging.DEBUG (maximize logging output)
Equivalent to: [--Application.log_level=10]
--show-config
Show the application's configuration (human-readable format)
Equivalent to: [--Application.show_config=True]
--show-config-json
Show the application's configuration (json format)
Equivalent to: [--Application.show_config_json=True]
--generate-config
generate default config file
Equivalent to: [--JupyterApp.generate_config=True]
-y
Answer yes to any questions instead of prompting.
Equivalent to: [--JupyterApp.answer_yes=True]
--execute
Execute the notebook prior to export.
Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
Write notebook output to stdout instead of files.
Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
Run nbconvert in place, overwriting the existing notebook (only
relevant when converting to notebook format)
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
Clear output of current file and save in place,
overwriting the existing notebook.
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--coalesce-streams
Coalesce consecutive stdout and stderr outputs into one stream (within each cell).
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --CoalesceStreamsPreprocessor.enabled=True]
--no-prompt
Exclude input and output prompts from converted document.
Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
Exclude input cells and output prompts from converted document.
This mode is ideal for generating code-free reports.
Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
Whether to allow downloading chromium if no suitable version is found on the system.
Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
Disable chromium security sandbox when converting to PDF..
Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
Shows code input. This flag is only useful for dejavu users.
Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
Equivalent to: [--HTMLExporter.embed_images=True]
--sanitize-html
Whether the HTML in Markdown cells and cell outputs should be sanitized..
Equivalent to: [--HTMLExporter.sanitize_html=True]
--log-level=<Enum>
Set the log level by value or name.
Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
Default: 30
Equivalent to: [--Application.log_level]
--config=<Unicode>
Full path of a config file.
Default: ''
Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
The export format to be used, either one of the built-in formats
['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'qtpdf', 'qtpng', 'rst', 'script', 'slides', 'webpdf']
or a dotted object name that represents the import path for an
``Exporter`` class
Default: ''
Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
Name of the template to use
Default: ''
Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
Name of the template file to use
Default: None
Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
as prebuilt extension for the lab template)
Default: 'light'
Equivalent to: [--HTMLExporter.theme]
--sanitize_html=<Bool>
Whether the HTML in Markdown cells and cell outputs should be sanitized.This
should be set to True by nbviewer or similar tools.
Default: False
Equivalent to: [--HTMLExporter.sanitize_html]
--writer=<DottedObjectName>
Writer class used to write the
results of the conversion
Default: 'FilesWriter'
Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
PostProcessor class used to write the
results of the conversion
Default: ''
Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
Overwrite base name use for output files.
Supports pattern replacements '{notebook_name}'.
Default: '{notebook_name}'
Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
Directory to write output(s) to. Defaults
to output to the directory of each notebook. To recover
previous default behaviour (outputting to the current
working directory) use . as the flag value.
Default: ''
Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
The URL prefix for reveal.js (version 3.x).
This defaults to the reveal CDN, but can be any url pointing to a copy
of reveal.js.
For speaker notes to work, this must be a relative path to a local
copy of reveal.js: e.g., "reveal.js".
If a relative path is given, it must be a subdirectory of the
current directory (from which the server is run).
See the usage documentation
(https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
for more details.
Default: ''
Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
The nbformat version to write.
Use this to downgrade notebooks.
Choices: any of [1, 2, 3, 4]
Default: 4
Equivalent to: [--NotebookExporter.nbformat_version]
Examples
--------
The simplest way to use nbconvert is
> jupyter nbconvert mynotebook.ipynb --to html
Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'qtpdf', 'qtpng', 'rst', 'script', 'slides', 'webpdf'].
> jupyter nbconvert --to latex mynotebook.ipynb
Both HTML and LaTeX support multiple output templates. LaTeX includes
'base', 'article' and 'report'. HTML includes 'basic', 'lab' and
'classic'. You can specify the flavor of the format used.
> jupyter nbconvert --to html --template lab mynotebook.ipynb
You can also pipe the output to stdout, rather than a file
> jupyter nbconvert mynotebook.ipynb --stdout
PDF is generated via latex
> jupyter nbconvert mynotebook.ipynb --to pdf
You can get (and serve) a Reveal.js-powered slideshow
> jupyter nbconvert myslides.ipynb --to slides --post serve
Multiple notebooks can be given at the command line in a couple of
different ways:
> jupyter nbconvert notebook*.ipynb
> jupyter nbconvert notebook1.ipynb notebook2.ipynb
or you can specify the notebooks list in a config file, containing::
c.NbConvertApp.notebooks = ["my_notebook.ipynb"]
> jupyter nbconvert --config mycfg.py
To see all available configurables, use `--help-all`.
In [14]:
!jupyter nbconvert --to html /content/Wine_Quality.ipynb
[NbConvertApp] WARNING | pattern '/content/Wine_Quality.ipynb' matched no files
This application is used to convert notebook files (*.ipynb)
to various other formats.
WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.
Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
<cmd> --help-all
--debug
set log level to logging.DEBUG (maximize logging output)
Equivalent to: [--Application.log_level=10]
--show-config
Show the application's configuration (human-readable format)
Equivalent to: [--Application.show_config=True]
--show-config-json
Show the application's configuration (json format)
Equivalent to: [--Application.show_config_json=True]
--generate-config
generate default config file
Equivalent to: [--JupyterApp.generate_config=True]
-y
Answer yes to any questions instead of prompting.
Equivalent to: [--JupyterApp.answer_yes=True]
--execute
Execute the notebook prior to export.
Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
Write notebook output to stdout instead of files.
Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
Run nbconvert in place, overwriting the existing notebook (only
relevant when converting to notebook format)
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
Clear output of current file and save in place,
overwriting the existing notebook.
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--coalesce-streams
Coalesce consecutive stdout and stderr outputs into one stream (within each cell).
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --CoalesceStreamsPreprocessor.enabled=True]
--no-prompt
Exclude input and output prompts from converted document.
Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
Exclude input cells and output prompts from converted document.
This mode is ideal for generating code-free reports.
Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
Whether to allow downloading chromium if no suitable version is found on the system.
Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
Disable chromium security sandbox when converting to PDF..
Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
Shows code input. This flag is only useful for dejavu users.
Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
Equivalent to: [--HTMLExporter.embed_images=True]
--sanitize-html
Whether the HTML in Markdown cells and cell outputs should be sanitized..
Equivalent to: [--HTMLExporter.sanitize_html=True]
--log-level=<Enum>
Set the log level by value or name.
Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
Default: 30
Equivalent to: [--Application.log_level]
--config=<Unicode>
Full path of a config file.
Default: ''
Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
The export format to be used, either one of the built-in formats
['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'qtpdf', 'qtpng', 'rst', 'script', 'slides', 'webpdf']
or a dotted object name that represents the import path for an
``Exporter`` class
Default: ''
Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
Name of the template to use
Default: ''
Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
Name of the template file to use
Default: None
Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
as prebuilt extension for the lab template)
Default: 'light'
Equivalent to: [--HTMLExporter.theme]
--sanitize_html=<Bool>
Whether the HTML in Markdown cells and cell outputs should be sanitized.This
should be set to True by nbviewer or similar tools.
Default: False
Equivalent to: [--HTMLExporter.sanitize_html]
--writer=<DottedObjectName>
Writer class used to write the
results of the conversion
Default: 'FilesWriter'
Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
PostProcessor class used to write the
results of the conversion
Default: ''
Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
Overwrite base name use for output files.
Supports pattern replacements '{notebook_name}'.
Default: '{notebook_name}'
Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
Directory to write output(s) to. Defaults
to output to the directory of each notebook. To recover
previous default behaviour (outputting to the current
working directory) use . as the flag value.
Default: ''
Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
The URL prefix for reveal.js (version 3.x).
This defaults to the reveal CDN, but can be any url pointing to a copy
of reveal.js.
For speaker notes to work, this must be a relative path to a local
copy of reveal.js: e.g., "reveal.js".
If a relative path is given, it must be a subdirectory of the
current directory (from which the server is run).
See the usage documentation
(https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
for more details.
Default: ''
Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
The nbformat version to write.
Use this to downgrade notebooks.
Choices: any of [1, 2, 3, 4]
Default: 4
Equivalent to: [--NotebookExporter.nbformat_version]
Examples
--------
The simplest way to use nbconvert is
> jupyter nbconvert mynotebook.ipynb --to html
Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'qtpdf', 'qtpng', 'rst', 'script', 'slides', 'webpdf'].
> jupyter nbconvert --to latex mynotebook.ipynb
Both HTML and LaTeX support multiple output templates. LaTeX includes
'base', 'article' and 'report'. HTML includes 'basic', 'lab' and
'classic'. You can specify the flavor of the format used.
> jupyter nbconvert --to html --template lab mynotebook.ipynb
You can also pipe the output to stdout, rather than a file
> jupyter nbconvert mynotebook.ipynb --stdout
PDF is generated via latex
> jupyter nbconvert mynotebook.ipynb --to pdf
You can get (and serve) a Reveal.js-powered slideshow
> jupyter nbconvert myslides.ipynb --to slides --post serve
Multiple notebooks can be given at the command line in a couple of
different ways:
> jupyter nbconvert notebook*.ipynb
> jupyter nbconvert notebook1.ipynb notebook2.ipynb
or you can specify the notebooks list in a config file, containing::
c.NbConvertApp.notebooks = ["my_notebook.ipynb"]
> jupyter nbconvert --config mycfg.py
To see all available configurables, use `--help-all`.
In [14]: